library(data.table)
## data.table 1.14.9 IN DEVELOPMENT built 2023-02-17 18:32:02 UTC; root using 2 threads (see ?getDTthreads). Latest news: r-datatable.com
## **********
## This development version of data.table was built more than 4 weeks ago. Please update: data.table::update_dev_pkg()
## **********
library(ggplot2)
library(plyr)
library(magrittr)
library(readr)
library(stringr)
library(CellBarcode)
library(ggrepel)
library(ROCR)
library(ggsci)
theme0 <- theme_bw() + theme(
text = element_text(size = 15),
line = element_line(size = 1),
axis.line = element_line(size = 1),
axis.ticks.length = unit(3, units = "mm"),
axis.text.x = element_text(
margin = margin(t = 2, unit = "mm")
, angle = 60, vjust = 1, size = 15, hjust = 1),
axis.text.y = element_text(margin = margin(r = 3, l = 5, unit = "mm")),
legend.position = "right",
)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
theme0_lt = theme0 + theme(legend.position = "top")
theme1 = theme0 + theme(
axis.text.x = element_text(
margin = margin(t = 2, unit = "mm")
, angle = 0, vjust = 1, size = 12, hjust = 0.5)
)
knitr::opts_chunk$set(fig.width=20, fig.height=12)
################################################################################
# Analysis target #
################################################################################
Sample info
sample_info = fread("../run_simulation_no_umi/non_umi_simu_design_matrix.tsv")
sample_info$simu_id %<>% as.character
setkey(sample_info, "simu_id")
i_level_simu_name = sample_info[order(as.integer(simu_id)), simu_name][1:26]
Clone size disbribution
true_barcode_l = read_rds("../tmp/non_umi_simulation_ref.rds")
true_barcode_l = read_rds("../run_preprocess_simulation/tmp/non_umi_simulation_ref.rds")
d_true_barcode = true_barcode_l %>% rbindlist(idcol = "simu_file")
x = d_true_barcode$simu_file %>% str_match("barcode_simulation_\\d+_simu_seq_(.*)") %>% extract(, c(2))
table(x, useNA = "always")
## x
## 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 28 3 4 5 6 7 8 9 <NA>
## 8999 8998 8999 8998 8999 8999 8998 9000 8997 8998 8863 4499 8854 8999 6861 6869 6855 11971 20058 17997 35978 17989 35981 8998 8999 9000 8999 8996 0
d_true_barcode = cbind(d_true_barcode, simu_id = x)
# d_true_barcode[simu_name == "hamming" & grepl("simulation_19_", simu_file), .N, by = barcode_seq][N > 1]
# d_true_barcode[simu_name == "vdj_barcode" & grepl("simulation_1_", simu_file), .N, by = barcode_seq][N > 1]
# d_true_barcode[simu_name == "vdj_barcode"]
d_true_barcode$simu_name = sample_info[d_true_barcode$simu_id, simu_name]
table(d_true_barcode$simu_name, useNA = "always")
##
## barcode_length_10 barcode_size_mean_3_variation_1_clone_n_1200 barcode_size_mean_3_variation_1_clone_n_600 base clone_n_1200
## 8999 35978 17997 8999 35981
## clone_n_150 clone_n_600 cycle_20 cycle_40 efficiency_0.5
## 4499 17989 8996 8998 8999
## efficiency_0.9 error_1e-5 error_1e-7 hamming hamming_size_variation_3
## 8998 8999 8999 8863 8854
## ngs_profile_MSv1 pcr_read_100 pcr_read_1000 pcr_read_25 size_mean_0.6
## 8998 9000 8997 8998 8998
## size_mean_3 size_variation_1 size_variation_3 vdj_barcode vdj_barcode_size_mean_3_variation_1_clone_n_1200
## 8999 9000 8999 6861 20058
## vdj_barcode_size_mean_3_variation_1_clone_n_600 vdj_barcode_size_variation_1 vdj_barcode_size_variation_3 <NA>
## 11971 6869 6855 0
d_true_barcode = rename(d_true_barcode, c("seq" = "barcode_seq"))
Barcode frequency
ggplot(d_true_barcode[, .(freq = sum(freq)), by = .(simu_file, barcode_seq, simu_name)]) + aes(x = simu_name, y = freq) +
geom_boxplot() +
# geom_jitter(alpha = 0.2) +
theme0 + scale_y_log10() +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 53975 rows containing missing values (`stat_boxplot()`).

AUC
# NOTE: input
d_auc = fread("./tmp/table_auc_no_umi_clustering.tsv")
d_count_true_barcode = fread("tmp/table_count_true_barcode_clustering.tsv")
count - true barcode
ggplot(d_count_true_barcode) + aes(x = simu_name, y = count, color = factor(is_true)) +
geom_boxplot(alpha = 0.2) +
theme0_lt + scale_y_log10() + scale_color_npg() +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 71827 rows containing missing values (`stat_boxplot()`).
## Warning: Removed 14543 rows containing non-finite values (`stat_boxplot()`).

AUC versus simu conditions
ggplot(d_auc) + aes(x = simu_name, y = auc) + geom_boxplot() + theme0 +
scale_x_discrete(limits = i_level_simu_name) +
labs(y = "AUC") + lims(y = c(0, 1))
## Warning: Removed 60 rows containing missing values (`stat_boxplot()`).

ggplot(d_auc) + aes(x = simu_name, y = aucpr) + geom_boxplot() + theme0 +
scale_x_discrete(limits = i_level_simu_name) +
labs(y = "P-R AUC") + lims(y = c(0, 1))
## Warning: Removed 60 rows containing missing values (`stat_boxplot()`).

Resolusion index
d_count_true_barcode = fread("tmp/table_count_true_barcode_clustering.tsv")
Apply the autothreshold strategy
# NOTE: input
d = fread("./tmp/table_no_umi_auto_threshold_predict_rate_clustering.tsv")
d_plot_pr = melt(d, id.vars = "simu_name", measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value")
## Warning in melt.data.table(d, id.vars = "simu_name", measure.vars = c("pre", : 'measure.vars' [pre, rec, ...] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'. All measure variables not of type 'double' will be coerced too.
## Check DETAILS in ?melt.data.table for more on coercion.
d_plot_tf = melt(d, id.vars = "simu_name", measure.vars=c("tpr", "fpr"), variable.name = "pr", value.name = "value")
## Warning in melt.data.table(d, id.vars = "simu_name", measure.vars = c("tpr", : 'measure.vars' [tpr, fpr, ...] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'. All measure variables not of type 'double' will be coerced too.
## Check DETAILS in ?melt.data.table for more on coercion.
ggplot(d_plot_pr) + aes(x = simu_name, y = value, fill = factor(pr)) +
geom_boxplot() + theme0 +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_pr) + aes(x = simu_name, y = value) +
geom_boxplot() +
theme0 + facet_grid(pr ~ .) +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_tf) + aes(x = simu_name, y = value) +
geom_boxplot() +
theme0 + facet_grid(pr ~ .) +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

Apply the threshold of 0.0001 of left reads
# NOTE: input
d = fread("./tmp/table_no_umi_p0001_threshold_predict_rate_clustering.tsv")
d_plot_pr = melt(d, id.vars = "simu_name", measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value")
## Warning in melt.data.table(d, id.vars = "simu_name", measure.vars = c("pre", : 'measure.vars' [pre, rec, ...] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'. All measure variables not of type 'double' will be coerced too.
## Check DETAILS in ?melt.data.table for more on coercion.
d_plot_tf = melt(d, id.vars = "simu_name", measure.vars=c("tpr", "fpr"), variable.name = "pr", value.name = "value")
## Warning in melt.data.table(d, id.vars = "simu_name", measure.vars = c("tpr", : 'measure.vars' [tpr, fpr, ...] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'. All measure variables not of type 'double' will be coerced too.
## Check DETAILS in ?melt.data.table for more on coercion.
ggplot(d_plot_pr) + aes(x = simu_name, y = value, fill = factor(pr)) +
geom_boxplot() + theme0 +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_pr) + aes(x = simu_name, y = value) +
geom_boxplot() +
theme0 + facet_grid(pr ~ .) +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_tf) + aes(x = simu_name, y = value) +
geom_boxplot() +
theme0 + facet_grid(pr ~ .) +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

Merge threshold
d1 = fread("./tmp/table_no_umi_p0001_threshold_predict_rate_clustering.tsv")[, resolution := "res0.0001"]
d2 = fread("./tmp/table_no_umi_p00001_threshold_predict_rate_clustering.tsv")[, resolution := "res0.00001"]
d = rbindlist(list(d1, d2))
d = melt(d, id.vars = c("simu_name", "resolution"), measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value")
d$resolution %<>% factor(levels = c("res0.0001", "res0.00001"))
ggplot(d) + aes(x = simu_name, y = value, color = resolution) +
geom_boxplot() +
theme0 + facet_grid(pr ~ .) + scale_color_npg() +
scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 240 rows containing missing values (`stat_boxplot()`).
